setwd("C:/Workshop/Data")
policies <- read.csv("Rates.csv")
head(policies)
## Gender State State.Rate Height Weight BMI Age Rate
## 1 Male MA 0.10043368 184 67.8 20.02599 77 0.33200000
## 2 Male VA 0.14172319 163 89.4 33.64824 82 0.86914779
## 3 Male NY 0.09080315 170 81.2 28.09689 31 0.01000000
## 4 Male TN 0.11997276 175 99.7 32.55510 39 0.02153204
## 5 Male FL 0.11034460 184 72.1 21.29608 68 0.14975000
## 6 Male WA 0.16292470 166 98.4 35.70910 64 0.21123703
summary(policies)
## Gender State State.Rate Height
## Female:956 CA : 191 Min. :0.0010 Min. :150.0
## Male :986 TX : 169 1st Qu.:0.1103 1st Qu.:162.0
## FL : 104 Median :0.1276 Median :170.0
## NY : 94 Mean :0.1381 Mean :169.7
## IL : 80 3rd Qu.:0.1443 3rd Qu.:176.0
## OH : 77 Max. :0.3181 Max. :190.0
## (Other):1227
## Weight BMI Age Rate
## Min. : 44.10 Min. :16.02 Min. :18.00 Min. :0.00100
## 1st Qu.: 68.60 1st Qu.:23.74 1st Qu.:34.00 1st Qu.:0.01475
## Median : 81.30 Median :28.06 Median :51.00 Median :0.04628
## Mean : 81.16 Mean :28.29 Mean :50.84 Mean :0.13806
## 3rd Qu.: 93.80 3rd Qu.:32.46 3rd Qu.:68.00 3rd Qu.:0.17269
## Max. :116.50 Max. :46.80 Max. :84.00 Max. :0.99900
##
library(RColorBrewer)
palette <- brewer.pal(9, "Reds")
plot(
x = policies,
col = palette[cut(
x = policies$Rate,
breaks = 9)])
library(corrgram)
corrgram(policies)
cor(policies[3:8])
## State.Rate Height Weight BMI Age
## State.Rate 1.000000000 -0.01652294 0.009233267 0.01924141 0.11234748
## Height -0.016522938 1.00000000 0.238085304 -0.31696110 -0.16478131
## Weight 0.009233267 0.23808530 1.000000000 0.83962760 0.01167918
## BMI 0.019241409 -0.31696110 0.839627602 1.00000000 0.10231657
## Age 0.112347476 -0.16478131 0.011679178 0.10231657 1.00000000
## Rate 0.226852143 -0.12858150 0.060939196 0.14050657 0.78007905
## Rate
## State.Rate 0.2268521
## Height -0.1285815
## Weight 0.0609392
## BMI 0.1405066
## Age 0.7800790
## Rate 1.0000000
Question: Which variable is most strongly correlated with mortality rate?
Get the correlation for age and rate.
cor(
x = policies$Age,
y = policies$Rate)
## [1] 0.780079
plot(
x = policies$Age,
y = policies$Rate)
set.seed(42)
library(caret)
indexes <- createDataPartition(
y = policies$Rate,
p = 0.80,
list = FALSE)
train <- policies[indexes, ]
test <- policies[-indexes, ]
print(nrow(train))
## [1] 1555
print(nrow(test))
## [1] 387
simpleModel <- lm(
formula = Rate ~ Age,
data = train)
plot(
x = policies$Age,
y = policies$Rate)
lines(
x = train$Age,
y = simpleModel$fitted,
col = "red",
lwd = 3)
summary(simpleModel)
##
## Call:
## lm(formula = Rate ~ Age, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18237 -0.09092 -0.02208 0.06002 0.62697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2650115 0.0087679 -30.23 <2e-16 ***
## Age 0.0079630 0.0001609 49.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1236 on 1553 degrees of freedom
## Multiple R-squared: 0.6121, Adjusted R-squared: 0.6118
## F-statistic: 2450 on 1 and 1553 DF, p-value: < 2.2e-16
simplePredictions <- predict(
object = simpleModel,
newdata = test)
plot(
x = policies$Age,
y = policies$Rate)
points(
x = test$Age,
y = simplePredictions,
col = "blue",
pch = 4,
lwd = 2)
simpleRMSE <- sqrt(mean((test$Rate - simplePredictions)^2))
print(simpleRMSE)
## [1] 0.1148266
multipleModel <- lm(
formula = Rate ~ Age + Gender + State.Rate + BMI,
data = train)
summary(multipleModel)
##
## Call:
## lm(formula = Rate ~ Age + Gender + State.Rate + BMI, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24620 -0.08738 -0.02936 0.05979 0.60437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4233471 0.0189843 -22.300 < 2e-16 ***
## Age 0.0077309 0.0001561 49.538 < 2e-16 ***
## GenderMale 0.0355968 0.0060606 5.873 5.21e-09 ***
## State.Rate 0.6258740 0.0686779 9.113 < 2e-16 ***
## BMI 0.0023340 0.0005240 4.454 9.03e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1188 on 1550 degrees of freedom
## Multiple R-squared: 0.6424, Adjusted R-squared: 0.6415
## F-statistic: 696.3 on 4 and 1550 DF, p-value: < 2.2e-16
multiplePredictions <- predict(
object = multipleModel,
newdata = test)
plot(
x = policies$Age,
y = policies$Rate)
points(
x = test$Age,
y = multiplePredictions,
col = "blue",
pch = 4,
lwd = 2)
multipleRMSE <- sqrt(mean((test$Rate - multiplePredictions)^2))
print(multipleRMSE)
## [1] 0.1102457
normalize <- function(x) {
(x - min(x)) / (max(x) - min(x)) - 0.5
}
denormalize <- function(x, y) {
((x + 0.5) * (max(y) - min(y))) + min(y)
}
scaledPolicies <- data.frame(
Gender = policies$Gender,
State.Rate = normalize(policies$State.Rate),
BMI = normalize(policies$BMI),
Age = normalize(policies$Age),
Rate = normalize(policies$Rate))
scaledTrain <- scaledPolicies[indexes, ]
scaledTest <- scaledPolicies[-indexes, ]
library(nnet)
neuralRegressor <- nnet(
formula = Rate ~ .,
data = scaledTrain,
linout = TRUE,
size = 5,
decay = 0.0001,
maxit = 1000)
scaledPredictions <- predict(
object = neuralRegressor,
newdata = scaledTest)
neuralPredictions <- denormalize(
x = scaledPredictions,
y = policies$Rate)
plot(
x = train$Age,
y = train$Rate)
points(
x = test$Age,
y = neuralPredictions,
col = "blue",
pch = 4,
lwd = 2)
library(NeuralNetTools)
plotnet(neuralRegressor)
neuralRMSE <- sqrt(mean((test$Rate - neuralPredictions)^2))
print(neuralRMSE)
## [1] 0.039926
print(simpleRMSE)
## [1] 0.1148266
print(multipleRMSE)
## [1] 0.1102457
print(neuralRMSE)
## [1] 0.039926